using System;
using System.Collections.Concurrent;
using System.Threading.Tasks;
using size_t = nuint;

/// <summary>
/// Runs a multiplication in parallel by splitting the rows of C (and the matching rows of A)
/// into contiguous chunks and handing each chunk to a pooled <typeparamref name="TSingleMul"/>
/// worker. B is passed to every worker unchanged.
/// </summary>
/// <typeparam name="TSingleMul">
/// Worker multiplier type. One instance is created per configured thread up front; extra
/// instances are created on demand when the pool is momentarily empty.
/// </typeparam>
unsafe class ParallelMatrixMultiplier<TSingleMul> : IMatrixMultiplier
	where TSingleMul : IMatrixMultiplier, new()
{
	// Pool of worker multipliers. Each parallel chunk borrows one and hands it back afterwards.
	ConcurrentBag<TSingleMul> bag;
	ParallelOptions options;
	private bool disposedValue;

	bool tryLargePage;

	/// <summary>
	/// Value assigned to <c>TryLargePage</c> on every worker. Setting it updates the workers
	/// currently sitting in the pool; workers created later receive the stored value.
	/// Workers borrowed by an in-flight <see cref="Multiply"/> are not in the pool at that
	/// moment and therefore are not updated by the setter.
	/// </summary>
	/// <exception cref="ObjectDisposedException">The setter is used after disposal.</exception>
	public bool TryLargePage
	{
		get => tryLargePage;
		set
		{
			// The pool is null after Dispose; fail with a meaningful exception instead of
			// a NullReferenceException.
			ThrowIfDisposed();
			tryLargePage = value;
			foreach (var mul in bag)
				mul.TryLargePage = value;
		}
	}

	/// <summary>Creates a worker configured with the current <see cref="TryLargePage"/> value.</summary>
	TSingleMul PopulateSingle()
	{
		var mul = new TSingleMul();
		mul.TryLargePage = TryLargePage;
		return mul;
	}

	/// <summary>Throws if this instance has already been disposed.</summary>
	void ThrowIfDisposed()
	{
		if (disposedValue)
			throw new ObjectDisposedException(GetType().FullName);
	}

	/// <summary>
	/// Creates the multiplier and pre-populates the pool with one worker per thread.
	/// </summary>
	/// <param name="threadNum">
	/// Maximum degree of parallelism. Must be positive, or -1 to use
	/// <see cref="Environment.ProcessorCount"/>.
	/// </param>
	/// <exception cref="ArgumentOutOfRangeException">
	/// <paramref name="threadNum"/> is 0 or less than -1.
	/// </exception>
	public ParallelMatrixMultiplier(int threadNum)
	{
		if (threadNum < -1 || threadNum == 0)
			throw new ArgumentOutOfRangeException(nameof(threadNum), threadNum,
				"Thread count must be positive, or -1 to use Environment.ProcessorCount.");
		threadNum = threadNum > 0 ? threadNum : Environment.ProcessorCount;

		options = new ParallelOptions();
		options.MaxDegreeOfParallelism = threadNum;
		bag = new ConcurrentBag<TSingleMul>();
		while (threadNum-- > 0)
			bag.Add(PopulateSingle());
	}

	/// <summary>
	/// Splits the C_rows rows of C and A into at most <c>MaxDegreeOfParallelism</c> contiguous
	/// chunks and calls <c>Multiply</c> on a pooled worker for each chunk in parallel.
	/// Returns immediately when <paramref name="C_rows"/> or <paramref name="C_cols"/> is 0.
	/// </summary>
	/// <param name="add">Forwarded unchanged to the workers (presumably selects accumulate vs. overwrite; confirm against IMatrixMultiplier).</param>
	/// <param name="C">Start of the output matrix; chunk i starts at row i * chunkRows.</param>
	/// <param name="A">Start of the left operand; partitioned by the same rows as C.</param>
	/// <param name="B">Right operand; shared by all chunks.</param>
	/// <param name="C_rows">Number of rows of C to process.</param>
	/// <param name="C_cols">Forwarded unchanged to the workers.</param>
	/// <param name="A_cols">Forwarded unchanged to the workers.</param>
	/// <param name="C_stride">Distance between consecutive rows of C, in doubles.</param>
	/// <param name="A_stride">Distance between consecutive rows of A, in doubles.</param>
	/// <param name="B_stride">Forwarded unchanged to the workers.</param>
	/// <exception cref="ObjectDisposedException">This instance has been disposed.</exception>
	public void Multiply(bool add, double* C, double* A, double* B, size_t C_rows, size_t C_cols, size_t A_cols, size_t C_stride, size_t A_stride, size_t B_stride)
	{
		ThrowIfDisposed();
		if (C_rows == 0 || C_cols == 0)
			return;

		// Rows per chunk = ceil(C_rows / threads). Written as (n - 1) / d + 1, which cannot
		// wrap around because C_rows > 0 here.
		size_t maxChunks = (size_t)options.MaxDegreeOfParallelism;
		size_t C_sub_rows = (C_rows - 1) / maxChunks + 1;

		// With few rows, fewer chunks than threads are needed to cover them (e.g. 9 rows on
		// 4 threads gives 3 chunks of 3). Always deriving the count from the chunk size
		// yields exactly the thread count whenever no shrinking is required.
		int chunkCount = (int)((C_rows - 1) / C_sub_rows + 1);

		Parallel.For(0, chunkCount, options, i =>
		{
			size_t firstRow = (uint)i * C_sub_rows;
			double* prod_sub = C + firstRow * C_stride;
			double* A_sub = A + firstRow * A_stride;
			// The last chunk takes whatever rows remain.
			size_t C_sub_rows_safe = i < chunkCount - 1 ? C_sub_rows : C_rows - firstRow;

			if (!bag.TryTake(out TSingleMul mul))
				mul = PopulateSingle();
			try
			{
				mul.Multiply(add, prod_sub, A_sub, B,
							 C_sub_rows_safe, C_cols, A_cols,
							 C_stride, A_stride, B_stride);
			}
			finally
			{
				// Return the worker even on failure so Dispose can still release it.
				bag.Add(mul);
			}
		});
	}

	/// <summary>
	/// Disposes every pooled worker (when <paramref name="disposing"/> is true), drops the
	/// pool and options, and marks the instance as disposed. Subsequent calls are no-ops.
	/// </summary>
	protected virtual void Dispose(bool disposing)
	{
		if (!disposedValue)
		{
			if (disposing)
			{
				while (bag.TryTake(out TSingleMul mul))
					mul.Dispose();
			}
			bag = null;
			options = null;
			disposedValue = true;
		}
	}

	/// <summary>Disposes all pooled workers and suppresses finalization.</summary>
	public void Dispose()
	{
		Dispose(disposing: true);
		GC.SuppressFinalize(this);
	}
}
